# General information
 
This file does the following:

1) Imports and prepares the CFS dataset, which contains the state to state trade flows.

2) Uses CFS plus proportionality to calculate state-state trade flows in manufacturing $X_{ij,k}$ for $i\in US\;  \&\;   j \in US$

## Input files

1. `0-Raw_Data/CFS/NAICS to CFS Crosswalk.xlsx`
2. `0-Raw_Data/CFS/CFS_2012_00A18_with_ann.csv`
3. `0-Raw_Data/CFS/CFS2007.xlsx`
4. `0-Raw_Data/CFS/CFS2002mine.xlsx`
5. `0-Raw_Data/CFS/CF1200A24.csv`
6. `0-Raw_Data/Fips/statefips.csv`
7. `0-Raw_Data/regions.csv`
8. `0-Raw_Data/sectors.csv`
9. `1-Intermediate_Processed_Data/country_country_step_.csv`

## Output files

1. `1-Intermediate_Processed_Data/CFSapportionment.csv`
2. `1-Intermediate_Processed_Data/CFS_Xijk.csv`
3. `1-Intermediate_Processed_Data/state_cfs_step_.csv`

```{r setup, include=FALSE}
knitr::opts_knit$set(root.dir = normalizePath(".."))
remove(list = ls())
```

# Importing and preparing the CFS dataset, and constructing commodity-sector redistribution matrix.

The following code associates each commodity code with a NAICS code and then transforms the latter in the final desired sector categories. It is important to note that each commodity can be associated with more than one NAICS, depending on the industry that that commodity was produced in.  
The code takes CFS 2007 and 2012 for the US, and calculates the proportion of the amount of commodity $c$ associated with NAICS $n$, with respect to the total amount of commodity $c$. This gives a matrix of (#commodities)x14 for each year. The code produces a matrix that shows how to redistribute the amount of each commodity into 13 sectors for the US for 2002, 2007 and 2012. The matrix for 2002 is the same as the one for 2007. 

```{r}
final <- c()
for (yr in c(2007, 2012)){
if (yr == 2007){
# Import CFS data with both NAICS and CFS codes
base <- read_excel(paste("0-Raw_Data/CFS/NAICS to CFS Crosswalk.xlsx", sep=""))
# Re-code manufacturing as NAICS 3 and de-string NAICS code
base <- base %>%
  filter(GEOTYPE == 1) %>%
  # Redistribution matrix for 2002 is the same as the one for 2007.
  dplyr::rename(NAICS2007 = NAICS2002) %>%
  dplyr::rename(NAICS2007_MEANING = NAICS2002_MEANING) %>%
  mutate(NAICS2007 = ifelse(NAICS2007 == "31-33", "3",NAICS2007)) %>%
  mutate(NAICS2007 = as.numeric(NAICS2007)) %>%
  dplyr::select(NAICS2007, NAICS2007_MEANING, COMM, COMM_MEANING, VAL)
}
if (yr == 2012){
# Import CFS data with both NAICS and CFS codes.
base <- read.csv(paste("0-Raw_Data/CFS//CFS_2012_00A18_with_ann.csv", sep=""), header = TRUE, sep = ",", dec = ".")
base <- base[(2:dim(base)[1]), ]
# Re-code manufacturing as NAICS 3 and de-string NAICS code
base <- base %>%
  dplyr::rename(NAICS2012=NAICS.id) %>%
  mutate(NAICS2012 = ifelse(NAICS2012 == "31-33", "3",NAICS2012)) %>%
  mutate(NAICS2012 = ifelse(NAICS2012 == "4931(CF1)", "4931",NAICS2012)) %>%
  mutate(NAICS2012 = as.numeric(NAICS2012))
# Keep just relevant variables: NAICS code, SCTG code, and value shipped
base <- base %>%
  dplyr::rename(NAICS2012_MEANING = NAICS.display.label) %>%
  dplyr::rename(COMM = COMM.id) %>%
  dplyr::rename(COMM_MEANING = COMM.display.label) %>%
  mutate(VAL = as.numeric(VAL)) %>%
  dplyr::select(NAICS2012, NAICS2012_MEANING, COMM, COMM_MEANING, VAL)
}

#Re-code some commodity categories. Drop "all commodities" category and "unknown commodity" category.
base <- base %>%
  mutate(COMM = as.character(COMM)) %>%
  mutate(COMM = ifelse(COMM == "07-R", "07", COMM)) %>%
  mutate(COMM = ifelse(COMM == "08-R", "08", COMM)) %>%
  mutate(COMM = ifelse(COMM == "17-R", "17", COMM)) %>%
  mutate(COMM = ifelse(COMM == "18-R", "18", COMM)) %>%
  mutate(COMM = as.numeric(COMM)) %>%
  filter(COMM != 0 & COMM != 99)
#Drop superfluous NAICS and turn them all into 3 digit numbers
if (yr == 2007){
base <- base %>%
  filter(NAICS2007 != 3 & NAICS2007 != 423 & NAICS2007 != 424) %>%
  filter(NAICS2007 <= 4230 | NAICS2007 >= 4250) %>%
  mutate(NAICS2007 = ifelse(NAICS2007 == 42, 421, NAICS2007)) %>%
  mutate(NAICS2007 = ifelse(NAICS2007 == 4541, 454, NAICS2007)) %>%
  mutate(NAICS2007 = ifelse(NAICS2007 == 45431, 455, NAICS2007)) %>%
  mutate(NAICS2007 = ifelse(NAICS2007 == 4931, 493, NAICS2007)) %>%
  mutate(NAICS2007 = ifelse(NAICS2007 == 5111, 511, NAICS2007)) %>%
  mutate(NAICS2007 = ifelse(NAICS2007 == 551114, 551, NAICS2007)) %>%
  #Map to 13 sectors.
  mutate(NAICS2007 = as.numeric(recode(NAICS2007, '551'="600", '311'="1", '312'="1", '313'="2", '314'="2", '315'="2", '316'="2", '321'="3", '322'="3", '323'="3", '324'="4", '325'="5", '326'="6", '327'="7", '331'="8", '332'="8", '333'="9", '334'="10", '335'="10", '336'="11", '337'="12", '339'="12", '421'="14", '454'="14", '455'="14", '493'="22", '511'="16", '551'="22"))) %>%
  filter(is.na(NAICS2007)==FALSE) %>%
  mutate(NAICS2007 = ifelse(NAICS2007 >=13 & NAICS2007 <= 22, 13, NAICS2007)) %>%
  dplyr::rename(cdp = NAICS2007) 
}
#Drop superfluous NAICS and turn them all into 3 digit numbers
if (yr == 2012){
base <- base %>%
  filter(NAICS2012 != 3 & NAICS2012 != 423 & NAICS2012 != 424) %>%
  filter(NAICS2012 <= 4230 | NAICS2012 >= 4250) %>%
  mutate(NAICS2012 = ifelse(NAICS2012 == 42, 421, NAICS2012)) %>%
  mutate(NAICS2012 = ifelse(NAICS2012 == 4541, 454, NAICS2012)) %>%
  mutate(NAICS2012 = ifelse(NAICS2012 == 45431, 455, NAICS2012)) %>%
  mutate(NAICS2012 = ifelse(NAICS2012 == 4931, 493, NAICS2012)) %>%
  mutate(NAICS2012 = ifelse(NAICS2012 == 5111, 511, NAICS2012)) %>%
  mutate(NAICS2012 = ifelse(NAICS2012 == 551114, 551, NAICS2012)) %>%
  #Map to 13 sectors.
  mutate(NAICS2012 = as.numeric(recode(NAICS2012, '551'="600", '311'="1", '312'="1", '313'="2", '314'="2", '315'="2", '316'="2", '321'="3", '322'="3", '323'="3", '324'="4", '325'="5", '326'="6", '327'="7", '331'="8", '332'="8", '333'="9", '334'="10", '335'="10", '336'="11", '337'="12", '339'="12", '421'="14", '454'="14", '455'="14", '493'="22", '511'="16", '551'="22"))) %>%
  filter(is.na(NAICS2012)==FALSE) %>%
  mutate(NAICS2012 = ifelse(NAICS2012 >=13 & NAICS2012 <= 22, 13, NAICS2012)) %>%
  dplyr::rename(cdp = NAICS2012) 
}

#Proportion of the amount of commodity "c" associated with NAICS "n", with respect to the total amount of commodity "c".
base$cdp <- paste("portion", base$cdp, sep="")  
base <- base %>%
  group_by(cdp, COMM) %>% 
  #Replace missing values with zeros.
  mutate(VAL = ifelse(is.na(VAL)==TRUE, 0, VAL)) %>%
  mutate(VAL = sum(VAL)) %>%
  ungroup() %>%
  distinct(cdp, COMM, .keep_all = TRUE) %>%
  dplyr::select(COMM, cdp, VAL) %>%
  arrange(COMM, cdp, VAL) %>%
  group_by(COMM) %>%
  mutate(totalsent = sum(VAL)) %>%
  ungroup() %>%
  mutate(portion = VAL/totalsent) %>%
  mutate(portion = ifelse(is.na(portion)==TRUE, 0, portion)) %>%
  dplyr::select(-VAL, -totalsent)
#Reshape to matrix form.
base <- spread(base, cdp, portion, fill = NA, convert = TRUE,  drop = TRUE, sep = NULL)
base$year <- yr
base[is.na(base)==TRUE] <- 0

#Appending years.
if (yr==2007){
base1 <- base   
base1$year <- 2002
final <- rbind(final, base1, base)
}
if (yr==2012){final <- rbind(final, base)}
}
#select commodity categories and each sector portions.
final <- final %>% 
  dplyr::select(COMM, portion1, portion2, portion3, portion4, portion5, portion6, portion7, portion8, portion9, portion10, portion11, portion12, portion13, year) %>%
  arrange(year, COMM)
write.table(final, file = paste("1-Intermediate_Processed_Data/CFSapportionment.csv", sep=""), sep = ",", row.names = FALSE)


# * The 22 sectors in CDP are:
# 
# * 01) Food, Beverage, and Tobacco Products (NAICS 311-312); 
# * 02) Textile, Textile Product Mills, Apparel, Leather, and Allied Products (NAICS 313-316);
# * 03) Wood Products, Paper, Printing, and Related Support Activities (NAICS 321-323); 
# * 04) Petroleum and Coal Products (NAICS 324); 
# * 05) Chemical (NAICS 325); 
# * 06) Plastics and Rubber Products (NAICS 326); 
# * 07) Nonmetallic Mineral Products (NAICS 327);
# * 08) Primary Metal and Fabricated Metal Products (NAICS 331-332); 
# * 09) Machinery (NAICS 333);
# * 10) Computer and Electronic Products, and Electrical Equipment and Appliance (NAICS 334-335); 
# * 11) Transportation Equipment (NAICS 336); 
# * 12) Furniture and Related Products, and Miscellaneous Manufacturing (NAICS 337-339); 
# * 14) Trade (NAICS 42-45);
# * 13) Construction (NAICS 23);
# * 15) Transport Services (NAICS 481-488); 
# * 16) Information Services (NAICS 511-518);
# * 17) Finance and Insurance (NAICS 521-525); 
# * 18) Real Estate (NAICS 531-533); 
# * 19) Education (NAICS 61); 
# * 20) Health Care (NAICS 621-624); 
# * 21) Accommodation and Food Services (NAICS 721-722); 
# * 22) Other Services (NAICS 493, 541, 55, 561, 562, 711-713, 811-814);
# *23.	(NAICS 111-115) Agriculture, Forestry, Fishing, and Hunting (c1)
# *Mining, Quarrying, and Oil and Gas Extraction (c2).
```

## CFS state-to-state manufacturing trade flows

The following code assigns state-to-state trade flows from CFS to the NAICS level. We then calculate $X_{ij,k}^{CFS}$ for the 12 manufacturing sectors (where $X_{ij,k}^{CFS}$ is the sales of state $i$ to state $j$ in sector $k$ according to CFS). We calculate weights for 13 sectors here; however, sector 13 contains the NAICS sectors we want to delete (basically because they are part of agriculture and services); so in the last chunk we will keep only our 12 manufacturing sectors.

```{r}
#Matrix for redistribution.
CFSapportionment <- read.csv(paste("1-Intermediate_Processed_Data/CFSapportionment.csv", sep=""), header = TRUE, sep = ",", dec = ".")
final <- c()
#States listed as origins and as destinations.
statefips <- read.csv(paste("0-Raw_Data/Fips/statefips.csv", sep=""), header = TRUE, sep = ",", dec = ".")
list <- statefips$statename
orig <- statefips %>% 
  dplyr::rename(orig_state = statefip) %>%
  dplyr::rename(origin = statename)
dest <- statefips %>% 
  dplyr::rename(dest_state = statefip) %>%
  dplyr::rename(destination = statename) 

for (yr in c(2002, 2007, 2012)){
# * Import CFS data from original table
# *"STATE Table 22. Shipment Characteristics by Destination and Two-Digit Commodity for State of Origin: 2002"
# *"Origin State","Destination State","Code","SCTG (2-Digit)","Value ($mil) 02","Value % 02","Tons (thous) 02","Tons % 02","Ton-miles (mil) 02","Ton-miles % 02","Avg miles 02"  
if (yr == 2002){
base <- data.frame(read_excel(paste("0-Raw_Data/CFS/CFS2002mine.xlsx",sep=""), sheet = "sttbl1412281724", skip = 1))
# Renaming variables
base <- base %>%
  dplyr::rename(   origin = Origin.State, destination = Destination.State, commodity = Code, value = Value...mil..02)
base$origin <- gsub(".", "", base$origin, fixed = TRUE)
base$destination <- gsub(".", "", base$destination, fixed = TRUE)
base <- base %>%
  #From state to state only.
  filter(origin %in% list & destination %in% list) %>%
  mutate(value = as.numeric(value))
}
if (yr == 2007){
base <- read_excel(paste("0-Raw_Data/CFS/CFS2007.xlsx", sep=""), sheet = "CF0700A22")
# Renaming variables
base <- base %>%
  dplyr::rename(origin = GEOGRAPHY, destination = DDESTGEO_MEANING, commodity = COMM, value = VAL) %>%
  #From state to state only.
  filter(origin %in% list & destination %in% list)
}
if (yr == 2012){
base <- read.csv(paste("0-Raw_Data/CFS/CF1200A24.csv", sep=""), header = TRUE, sep = ",", dec = ".")  
# Renaming variables
base <- base %>%
  dplyr::rename(origin = geo_ttl, destination = ddestgeo_ttl, commodity = comm, value = val) %>%
  #From state to state only.
  filter(origin %in% list & destination %in% list)
}
#Convert origin/dest to fips codes  
base <- base %>%
  dplyr::select(origin, destination, value, commodity) %>%
  #Only states as origins.
  filter(origin != "United States" & origin != "Total" & origin != "District of Columbia") %>%
  #Only states as destinations.
  filter(destination != "United States" & destination != "Total" & destination != "District of Columbia") %>%
  mutate(commodity = as.character(commodity))
base$origin <- gsub(" ", "", base$origin, fixed = TRUE)
base$destination <- gsub(" ", "", base$destination, fixed = TRUE)

# Get two digit codes
base <- base %>% 
  mutate(code= as.numeric(substr(x=commodity, start=1, stop=2))) %>%
  dplyr::select(origin, destination, code, value) %>%
  dplyr::rename(COMM = code) %>%
  filter(COMM != 99 & COMM != 0) %>% #takes out NA as well
  mutate(year = yr) %>%
  #Merge with distribution matrix.
  left_join(CFSapportionment, by=c('COMM'='COMM', 'year'='year'))
#Reshape to long to ease the calculation of X_{ij,k}^{CFS}.
base <- melt(base, id.vars=c("origin", "destination", "COMM", "value", "year"), variable.name="portion", value.name = "portion_value")
# Calculate X_{ij,k}^{CFS}.
base <- base %>%
  mutate(value = ifelse(is.na(value) == TRUE, 0, value)) %>%
  mutate(portion_value = portion_value * value) %>%
  dplyr::select(-value, -COMM) %>%
  group_by(origin, destination, year, portion) %>%
  mutate(portion_value = sum(portion_value)) %>%
  ungroup() %>%
  distinct(origin, destination, year, portion, .keep_all = TRUE) %>%
  mutate(sector = as.numeric(substr(x=portion, start=8, stop=9))) %>%
  dplyr::select(-portion) %>%
  mutate(origin = paste("trade",origin, sep=""))
#Reshape to wide.
base <- spread(base, origin, portion_value, fill = 0, convert = TRUE,  drop = TRUE, sep = NULL)
base <- base %>%
  arrange(sector, destination) %>%
  dplyr::rename(importer = destination) %>%
  dplyr::select(year, everything())

#Appending years.
final <- rbind(final, base)
}

write.table(final, file = paste("1-Intermediate_Processed_Data/CFS_Xijk.csv", sep=""), sep = ",", row.names = FALSE)


```

# Final state-to-state manufacturing trade flows

Using CFS data we construct: 

$$x_{ij,k}^{CFS}\equiv\dfrac{X_{ij,k}^{CFS}}{\sum_{h}\sum_{l}X_{hl,k}^{CFS}},$$ 
Our object of interest is:

$$X_{ij,k}=x_{ij,k}^{CFS}X_{US,US,k}^{WIOD},$$ 
where $X_{US,US,k}^{WIOD}$ is the consumption of the US in sector $k$ that is produced in the US, according to WIOD. Note that proportionality implies 
$$\sum_{i} \sum_{j} X_{ij,k}=X_{US,US,k}^{WIOD}$$ for each manufacturing sector. 

## Importing datasets to R

```{r}
#CFS
cfs_base <- read.csv("1-Intermediate_Processed_Data//CFS_Xijk.csv")
#WIOD
wiod_base <- c()
for (yr in 2000:2007) {
  wiod <- country.country <- read.csv(paste('1-Intermediate_Processed_Data//country_country_step_', yr, '.csv', sep= ""), header = TRUE, sep = ",", dec = ".")
  wiod_base <- rbind(wiod_base, wiod)
}

#Number of states.
regions <- read.csv("0-Raw_Data/regions.csv")
s_names <- regions %>% filter(status=="state") 
s_names <- s_names$region
n_s <- length(s_names)
#Number of sectors minus 3 (we use it to exclude agriculture, services, and non-participants).
reclass_sectors <- read.csv("0-Raw_Data/sectors.csv")
n_sec <- length(unique(reclass_sectors$final_sector)) - 3
#Small value.
epsilon <- 0.0001
```

## Manufacture bilateral trade between US states

```{r calculations}

for (yr in 2000:2007) {

#We only have CFS weights for three years, but we have WIOD data for 2000-2007, so we associate each year's WIOD data to the nearest available year of CFS weights.
c=c(2002, 2007, 2012)
ap=1000
for (i in 1:3) {
  p=yr-c[i]
  if(abs(p)<ap) {
    aprox = p
    ap = abs(p)
  }
}
yr_ <- yr-aprox

#CFS
CFS <- subset(cfs_base, year == yr_)
CFS <- subset(CFS, select = -year )

#WIOD
wiod <- subset(wiod_base, year == yr)
wiod <- subset(wiod, select = -year)  
  
# Create data frame for X_{jk,k}^{CFS}
df <- data.frame(CFS)

# Dimensions of data frame
n_rows <- dim(df)[1]
n_columns <- dim(df)[2]

# Sectors
n_sectors <- length(unique(df$sector))

# Unique sectors
unique_sectors <- unique(df$sector)

# Create sum of X_{hl,k}^{CFS}
XijkCFS <- df[,3:n_columns]
sum_hl_XhlkCFS <- numeric(n_sectors)
for(k in 1:n_sectors) {
  sum_hl_XhlkCFS[k] <- sum(XijkCFS[df$sector==k,])
}
XijkCFS_1 <- c()
denominator <- c()
#Keep only the 12 manufacturing sectors.
for (i in 1:n_sec) {
  denominator <- rbind(denominator,matrix(replicate(n_s*n_s,sum_hl_XhlkCFS[i]), 
                                         nrow = n_s, ncol = n_s))
  XijkCFS_1 <- rbind(XijkCFS_1,XijkCFS[df$sector==i,])
}
# Create x_{ij,k}^{CFS}
xijkCFS <- XijkCFS_1/denominator  

# Create data frame for X_{US,US,k}^{WIOD}.
df_wiod <- data.frame(wiod)
# Ignore non-USA exporters
df_wiod <- df_wiod %>% dplyr::select(importer, sector, USA)
#Filter out non-USA importers
USA_exporter_indices <- df_wiod$importer == "USA"
# Take into account only the first 12 sectors 
X_USA_USA_k <- df_wiod$USA[USA_exporter_indices]
X_USA_USA_k <- X_USA_USA_k[1:n_sec]
Xijk <- c()
#Create X_{ij,k}
for(i in 1:n_sec) {
  Xijk <- rbind(Xijk, xijkCFS[((i-1)*n_s + 1):(n_s*i),]*X_USA_USA_k[i])
}

#Appropriate columns of sector and importer. And arrange. 
Xijk$sector <- df$sector[1:(n_sec*n_s)]
Xijk$importer <- df$importer[1:(n_sec*n_s)]
Xijk <- Xijk %>% dplyr::select(importer, sector, everything())
xijkCFS$sector <- df$sector[1:(n_sec*n_s)]
xijkCFS$importer <- df$importer[1:(n_sec*n_s)]
xijkCFS <- xijkCFS %>% dplyr::select(importer, sector, everything())
################################################################################
#The sum over states of X_{ij,k} should sum up to X_{US,US,k}^{WIOD}.
# Check for all  k
for(k in 1:n_sec) {
#If the sum over states does not coincide with WIOD up to a small difference (tolerance level), the loop stops and prints an error message.
  if(abs(sum(Xijk[Xijk$sector==k,3:52])/X_USA_USA_k[k] -1)>epsilon) 
    stop(paste("states' exports to states != US WIOD exports to US for year", yr))
}
################################################################################
#Indicate year. Organize and rename columns.
Step2 <- Xijk 
Step2 <- Step2 %>%
  mutate(year = yr) %>%
  dplyr::select(year, importer, sector, everything())
colnames(Step2)[4:dim(Step2)[2]] = cfs_base$importer[1:n_s]

  #Exporting
write.table(Step2, file = paste("1-Intermediate_Processed_Data//state_cfs_step_", yr,".csv", sep=""), sep = ",", row.names = FALSE)
}
```


